/* This dofile converts the raw input data into Stata-ready data
 1. Harmonize data across years and categories
 2. Generate samples for code testing
 3. Convert text data to Stata format
 4. Calculate reference prices per category
 5. Calculate reset prices per category
 
Batcher code for parallel execution (see ssc install batcher): 
	batcher F:\MonPol Project\ReplicationPackage\Dofiles\a0_ImportData.do, it(1/6) betweendelay(10) t(C:/StataWD)

*/

* Technicalities
** Parameters
global parallel "1"		// 1 or 6
global sample ""		// "" or _sample
global testRun ""		// "" or 1

** Steps to run
global harmonizeData ""
	global dropExternal  "0"		// Only if you get data straight from IRI
	global fixPaptowls   "0"		// Only if you get data straight from IRI
	global fixToothpa    "0"		// Only if you get data straight from IRI
	global missingMilk   "0"		// Only if you get data straight from IRI
	global chainCrossref "1"
global generateSamples ""
global convertData ""
	global parseStores    "1"
	global parsePriceData "1"
global calcPrices ""
global calcResetPrices ""
global storeIdCrosswalk "1"

** Paths
global rootL "F:\MonPol Project\ReplicationPackage\Large"
global rootD "F:\MonPol Project\ReplicationPackage"

** Programs
qui do "$rootD/Dofiles/999_Paths"
qui do "$rootD/Dofiles/998_Programs"

** Log
cap log close log
if "$parallel" == "1" log using "$path_logs/a0_ImportData${sample}.smcl", append smcl name(log)
else log using "$path_logs/a0_ImportData${sample}_`1'.smcl", append smcl name(log)

** Split categories over computation threads ("embarrasingly parallel execution")
if "$parallel" == "1" || "`1'" == "" local categories "beer blades cigets coffee carbbev factiss diapers fzpizza hhclean hotdog coldcer deod fzdinent razors photo paptowl mayo peanbutr margbutr mustketc milk laundet shamp saltsnck spagsauc sugarsub toitisu toothbr toothpa yogurt soup"
else {
	global thread = `1'
	if $thread == 1 local categories "beer blades cigets deod"
	if $thread == 2 local categories "fzpizza hhclean hotdog coldcer"
	if $thread == 3 local categories "soup razors photo paptowl mayo peanbutr margbutr mustketc milk"
	if $thread == 4 local categories "yogurt spagsauc sugarsub toitisu toothbr toothpa"
	if $thread == 5 local categories "coffee laundet saltsnck shamp"
	if $thread == 6 local categories "factiss carbbev diapers fzdinent"
	
	if "`categories'" == "" {
		noisily di "Not assigned to category"
		exit
	}
}

if "$testRun" == "1" {
	if "$parallel" == "1" local categories "razors photo"
	else {
		if $thread == 1 local categories "razors"
		elif $thread == 2 local categories "photo"
		else {
			noisily di "Not assigned to category"
			exit
		}
	}
}

** Folders
cap mkdir "$path_intermediate"
cap mkdir "$path_intermediate/xrefs"
cap mkdir "$rootL/Data/Raw_sample"
cap mkdir "$path_input"
cap mkdir "$path_input/storeData"
cap mkdir "$path_input/priceData${sample}"
cap mkdir "$path_intermediate/priceData${sample}"
cap mkdir "$path_output"
cap mkdir "$path_output/completePriceData${sample}"
cap mkdir "$path_output/resetPrices${sample}"

* Code
** Harmonize data
if "$harmonizeData" == "1" {
	* In some years, data is in .../External/..., in others, it is not
	* We get rid of the (useless) /External/
	if "$dropExternal" == "1" {
		foreach year of numlist 1/7 12 {
			foreach category of local categories {
				!move "$path_raw\Year`year'\External/`category'" "$path_raw\Year`year'/`category'"
			}
		}
	}
	
	* Paptowl is written as paptowls in years 8/11
	if "$fixPaptowls" == "1" {
		foreach year of numlist 8/11 {
			!move "$path_raw\Year`year'/paptowls" "$path_raw\Year`year'/paptowl"
		}
	}	
	
	* Toothpa in year12 is in a separate folder within toothpa
	if "$fixToothpa" == "1" {
		local allfiles : dir "$path_raw\Year12/toothpa/toothpa" files "*", respectcase
		foreach file of local allfiles {
			di `"!move "$path_raw\Year12/toothpa/toothpa/`file'" "$path_raw\Year12/toothpa/`file'""'
			copy "$path_raw\Year12/toothpa/toothpa/`file'" "$path_raw\Year12/toothpa/`file'", replace
		}
	}	
	
	* Copy in the missing milk
	if "$missingMilk" == "1" {
		copy "$path_input/milk_drug_1427_1478" "$path_raw/Year7/milk/milk_drug_1427_1478", replace
		copy "$path_raw/Year7/diapers/Delivery_Stores" "$path_raw/Year7/milk/Delivery_Stores", replace
	}
	
	* Generate chain cross-reference file
	if "$chainCrossref" == "1" {
		** Import cross reference files
		import delimited "$path_raw\Extra\masked_chain_xref1_12.csv", clear stringcols(_all)
		
		** Generate unique chain ID (chain1 in year1 is different from chain1 in year8)
		gen year1_old = year1
		replace year1 = "year1_" + year1 if missing(year1) == 0
		
		forvalues year = 1/12 {
			gen temp_year`year' = "year`year'_" + year`year' if missing(year`year') == 0
			replace year1 = temp_year`year' if missing(year1)
			drop temp_year`year'
		}
		encode year1, gen(chainID_unique)
		drop year1
		rename year1_old year1

		** Align (yearly) chain names with dataset
		order chainID_unique year1, first

		forvalues year = 1/12 {
			replace year`year' = "Chain" + year`year' if missing(year`year') == 0
		}
		
		
		** Save complete xref file
		save "$path_intermediate/xrefs/chains_all", replace

		* Create individual xref files
		forvalues year = 1/12 {
			preserve
			keep chainID_unique year`year'
			rename year`year' chainID
			drop if missing(chainID)
			save "$path_intermediate/xrefs/chains_year`year'", replace
			restore
		}
	}
}

** Generate samples
if "$generateSamples" == "1" {
	noisily di "Generating manageable samples"
	
	qui forvalues year = 1/12 { 
		noisily di as result "Year: `year'"
		foreach category of local categories {
			noisily di _col(3) as text "`category'"
			cap mkdir "$rootL/Data/Raw_sample/Year`year'"
			cap mkdir "$rootL/Data/Raw_sample/Year`year'/"
			cap mkdir "$rootL/Data/Raw_sample/Year`year'/External"
			cap mkdir "$rootL/Data/Raw_sample/Year`year'/`category'"
			
			* Detect drug & groc-store files
			local drugfile : dir "$path_raw/Year`year'/`category'" files "*_drug_*"
			local grocfile : dir "$path_raw/Year`year'/`category'" files "*_groc_*"
			local allfiles : list drugfile | grocfile
			foreach file of local allfiles {
				* Load file
				infile long iri_key int week str2 (sy ge) str5 (vend item) int units float dollar str4 f byte (d pr) using "$path_raw/Year`year'/`category'/`file'." in 1/10000, clear
				
				* Save reduced file in appropriate location [*]
				outfile using "$rootL/Data/Raw_sample/Year`year'/`category'/`file'.", replace
				
				* Copy delivery stores file [*]
				copy "$path_raw/Year`year'/`category'/Delivery_Stores" "$rootL/Data/Raw_sample/Year`year'/`category'/Delivery_Stores", replace				
			}	// Closes drug/groc
		} // Closes categories
	} // Closes year
}

** Convert data
if "$convertData" == "1" {
	noisily di "Converting text data to useable Stata data"
	
	* Store information (unique per year, common across categories)
	if "$parseStores" == "1" & inlist("$thread", "", "1") {
		forvalues year = 1/12 {
			** Load store info (same for all categories, so use beer because it's beer)
			infix long iri_key 1-8 str2 ou 9-11 est_acv 12-20 str25 market 21-45 int open 46-50 int closed 51-55 str8 chainID 56-64 using "$rootL/Data/Raw${sample}/Year`year'/beer/Delivery_Stores.", clear
			drop in 1
			
			** Open and close dates
			gen int open_stata = open*7 + 7185 - 7
			gen int closed_stata = closed*7 + 7185 - 7
			format open_stata closed_stata %tdMon_DD,_CCYY
			
			** Unique chainID
			merge m:1 chainID using "$path_intermediate/xrefs/chains_year`year'", nogen keep(match master)
			
			** Clean up
			drop ou est_acv
			
			** Expand to weekly (necessary for merge, faster than using rangejoin due to I/O)
			local firstweek = 1114 + (`year'-1)*52 - 50
			local lastweek = 1114 + `year'*52 + 50
			replace open = `firstweek' if open < `firstweek'
			replace closed = `lastweek' if closed > `lastweek'
			
			gen weeksInSample = closed - open
			duplicates tag iri_key, gen(dub)
			
			sort iri_key
			egen unid = group(iri_key open)
			expand 400
			
			bysort unid: gen week = `firstweek' - 1 + _n
			drop if ~inrange(week, open, closed)
			
			** Save
			save "$path_input/storeData/storeInfo_year`year'", replace
		}
	}
	
	* Price data (this is what it's all about)
	if "$parsePriceData" == "1" {
		timer on 10
		** Convert text files to .dta files with upc- and store data
		noisily di "Converting text files."
		qui forvalues year = 1/12 {
			noisily di as result "Year: `year'"
			foreach category of local categories {
				noisily di _col(3) as text "`category': `c(current_time)'"
				*** Detect drug & groc-store files
				local drugfile : dir "$rootL/Data/Raw${sample}/Year`year'/`category'" files "*_drug_*"
				local grocfile : dir "$rootL/Data/Raw${sample}/Year`year'/`category'" files "*_groc_*"
				local allfiles : list drugfile | grocfile
				foreach file of local allfiles {
					**** Load file
					infile long iri_key int week str2 (sy ge) str5 (vend item) double units double dollar str4 f byte (d pr) using "$rootL/Data/Raw${sample}/Year`year'/`category'/`file'.", clear
					drop in 1
					
					**** Assemble data
					***** Grocery vs Drug store
					gen groc_dummy = (strpos("`file'", "groc") > 0)
					
					/***** Category
					gen category = "`category'"*/
					
					***** Generate UPC code (reduce file size)
					replace sy = "0" + sy if length(sy) < 2
					replace ge = "0" + ge
					forvalues i = 1/4 {
						replace vend = "0" + vend if length(vend) < 5
						replace item = "0" + item if length(item) < 5
					}
					gen upc_code = sy + "-" + ge + "-" + vend + "-" +item
					drop sy ge vend item
					
					***** Stata date
					gen int week_stata = week*7 + 7185 - 7
					format week_stata %tdMon_DD,_CCYY
					label var week_stata "Start of the week in stata format"
					rename week week_iri
					rename week_stata week
					
					assert week >= td(01jan2001)
									
					**** Clean up [*]
					drop f d
					drop if missing(dollar)
					
					**** Save single files
					cap mkdir "$path_input/priceData${sample}/`category'"
					save "$path_input/priceData${sample}/`category'/`file'", replace
				}
			}	// Category
		}	// Year
		timer off 10
		
		** Combine .dta files to category level files
		noisily di _newline "Combining drug/groc and year files to single category file."
		timer on 20
		qui foreach category of local categories {
			noisily di as result "`category'"
			
			*** Identify files and append
			clear
			local allFilesOfCategory : dir "$path_input/priceData${sample}/`category'" files "*.dta"
			foreach file of local allFilesOfCategory {
				append using "$path_input/priceData${sample}/`category'/`file'", nolabel
			}
			
			
			*** Identify private label products
			gen privateLabel = substr(upc_code, 1, 2) == "88"
			
			*** Encode upc_code
			noisily di as text _col(3) "Encoding ..."
			encode upc_code, gen(upc_code2)
			drop upc_code
			rename upc_code2 upc_code
			
			*** Generate id_nr
			noisily di as text _col(3) "Generating ID ..."
			gegen double id_nr = group(upc_code iri_key)
			label var id_nr "Product-Store combination"
			
			*** Fix private label id_nrs
			replace id_nr = id_nr + 0.1 if privateLabel == 1 & week >= td(01jan2007)
			replace id_nr = id_nr + 0.1 if privateLabel == 1 & week >= td(01jan2008)
			replace id_nr = id_nr + 0.1 if privateLabel == 1 & week >= td(01jan2012)
			drop privateLabel
			
			*** Generate price
			gen double price = dollar/units
			
			*** Order
			order id_nr upc_code week, first
			order price, after(week)
			
			sort id_nr week
			
			*** Save
			noisily di as text _col(3) "Saving ..."
			save "$path_intermediate/priceData${sample}/`category'.dta", replace
		}
		timer off 20
		noisily timer list
	} 
}

** Calculate prices
if "$calcPrices" == "1" {
	noisily di "Calculating reference and regular prices"
	
	* Set parameters
	local wind = 13											// Length of window
	local cutoff = 1/3										// Minimum percentage deviate from previous mode
	local acc = 0.5											// Minimum percentage non-missing
	local duration = 12*52 + 1
	local max_cycles = `duration'/`wind'

	* Calculate prices
	noisily di "Calculating reference prices."
	qui foreach category of local categories {
		noisily di as result _newline _newline "`category': `c(current_time)'"
		** Load data
		use "$path_intermediate/priceData${sample}/`category'", clear
		replace price = round(price, 0.01)
		
		** Prep data [*]
		xtset id_nr week, delta(7 days)
		rename pr ind_sale_IRI
	
		gen int year = year(week)
		gen int month = mofd(week)
		format month %tm
		
		bysort id_nr (week): gen gap = week - week[_n-1]
		drop units ind_sale_IRI groc_dummy week_iri
			
		** Calculate reference prices
		noisily di as text _col(3) "Reference prices ..."
		
		*** Generate observation number within window
		bysort id_nr (week): egen int obs_count = count(week)
		drop if obs_count < `wind'
		
		gen obs_number = 1
		replace obs_number = obs_number[_n-1] + 1 if id_nr[_n] == id_nr[_n-1]			// Observation number within product_store

		*** Placeholder variables
		gen double modalprice = 0
		gen fraction = 0

		*** Calculate rolling mode
		local wind = 13
		qui foreach roll of numlist 1/`wind' {																// Rolling windows
			if real("`roll'") < 10 noisily di in yellow  _newline " `roll': " _continue 
			if real("`roll'") >= 10 noisily di in yellow  _newline "`roll': " _continue 
			gen roll`roll'_cycle = 0
			foreach cycle of numlist 1/`max_cycles' {														// Creates window
				replace roll`roll'_cycle = `cycle' if obs_number-(`roll'-1) <= `cycle' * `wind' & obs_number-(`roll'-1) > (`cycle'-1) * `wind'
				noisily di in yellow  "." _continue
			}
			
			bysort id_nr roll`roll'_cycle (week): egen mode`roll' = mode(price), maxmode					// Calculate mode per cycle
			gen equal_to_mode`roll' = 1 if mode`roll' == price												// 1 if mode equals price
			gen is_missing`roll' = missing(price)
			bysort id_nr roll`roll'_cycle (week): egen sum_missing`roll' = total(is_missing`roll')			// Count number of obs where mode equals price
			bysort id_nr roll`roll'_cycle (week): egen sum_equal`roll' = total(equal_to_mode`roll')			// Count number of obs where mode equals price
			replace mode`roll' = 0 if sum_missing`roll' > `acc' * `wind'										// Set mode to missing if it occurred less often than acc * window
			replace modalprice = mode`roll' 			if obs_number == roll`roll'_cycle * `wind' + `roll' - (`wind'/2 + 0.5) 	
			replace fraction = sum_equal`roll'/`wind' 	if obs_number == roll`roll'_cycle * `wind' + `roll' - (`wind'/2 + 0.5) 	
			drop equal_to_mode`roll' is_missing`roll' mode`roll' sum_equal`roll' sum_missing`roll' roll`roll'_cycle
		}
		
		*** Turn modes into reference prices
		**** Reference price = modalprice if it's valid, else just previous ref price (note that the first modalprice is always defined)
		gen double price_ref = 0
		bysort id_nr (week): replace price_ref = modalprice 			if _n == 1
		bysort id_nr (week): replace price_ref = modalprice 			if fraction > `cutoff' & modalprice == price
		bysort id_nr (week): replace price_ref = price_ref[_n-1] 		if (fraction < `cutoff' | modalprice != price) & _n != 1	// This one I find disturbing...
		order price_ref, after(modalprice)

		**** Correct for timing discrepancy in change in actual price and mode
		foreach i in 1/(`wind' / 2 - 0.5) {
			bysort id_nr (week): replace price_ref = price_ref[_n+1] 	if price_ref != price_ref[_n+1] & price_ref[_n+1] == price & _n != obs_count & _n != 1
		}

		**** Set ref price to zero if missing actual price 
		replace price_ref = . if missing(price)
		
		**** Correct timing discrepancy for price change if missing in between
		***** One week missing in between
		bysort id_nr (week): replace price_ref = price_ref[_n+2]	if 	price == price_ref[_n+2]				///
																		&	price[_n+2] == price_ref[_n+2]				///
																		&	missing(price_ref[_n+1])					///
																		&	_n < (obs_count - 2)

		***** Two weeks missing in between														
		bysort id_nr (week): replace price_ref = price_ref[_n+3]	if 	price == price_ref[_n+3]				///
																		&	price[_n+3] == price_ref[_n+3]				///
																		&	missing(price_ref[_n+1])					///
																		&	missing(price_ref[_n+2])					///
																		&	_n < (obs_count - 3)
		***** Three weeks missing in between														
		bysort id_nr (week): replace price_ref = price_ref[_n+4]	if 	price == price_ref[_n+4]				///
																		&	price[_n+4] == price_ref[_n+4]				///
																		&	missing(price_ref[_n+1])					///
																		&	missing(price_ref[_n+2])					///
																		&	missing(price_ref[_n+3])					///
																		&	_n < (obs_count - 4)

		***** Four weeks missing in between														
		bysort id_nr (week): replace price_ref = price_ref[_n+5]	if 	price == price_ref[_n+5]				///
																		&	price[_n+5] == price_ref[_n+5]				///
																		&	missing(price_ref[_n+1])					///
																		&	missing(price_ref[_n+2])					///
																		&	missing(price_ref[_n+3])					///
																		&	missing(price_ref[_n+4])					///
																		&	_n < (obs_count - 5)														

		**** Correct for timing discrepancy in change in actual price and mode again (after the missing weeks correction)
		foreach i in 1/(`wind' / 2 - 0.5) {
			bysort id_nr (week): replace price_ref = price_ref[_n+1] 	if price_ref != price_ref[_n+1] & price_ref[_n+1] == price & _n != obs_count & _n != 1
		}
		
		
		*** Cleanup
		label var price_ref "Kehoe & Midrigan reference Price"
		drop obs_count obs_number modalprice fraction
		
		** Trim last month of regular and reference price (per id_nr) [TD]
		save "$path_output/completePriceData${sample}/`category'", replace
	}
}

if "$calcResetPrices" == "1" {
	foreach category of local categories {	
		* Load data
		use "$path_output/completePriceData${sample}/`category'", replace

		* Reduce to monthly indicator
		rename price price_raw
		
		** Calculate mode [*]
		cap rename price price_raw 		// Did not call it price_raw in the sample data
		global prices "price_raw price_ref"
		qui foreach price of global prices {
			noisily di "`price'"
			bysort id_nr month: egen double `price'Mode = mode(`price'), maxmode
		}
		
		** Dollars
		bysort id_nr month: egen double dollarSum = total(dollar)
		
		** Collapse
		gegen tag = tag(id_nr month)
		drop if tag == 0
		drop tag
		drop $prices dollar
		
		** Rename
		rename *Mode *
		rename *Sum *
		
		* Drop unused variables and observations
		** Retain based
		generateRetainIndicator
		drop if retain == 0
		
		** Valid based
		drop gap
		bysort id_nr (month): gen gap = month - month[_n-1]
		gen valid1 = (gap <= 1)
		
		bysort id_nr (month): drop if valid[_n+1] == 0 & valid == 0
		bysort id_nr (month): drop if valid == 0 & _n == _N
		
		** Variables
		drop upc_code iri_key retain week end* start* gap
		cap drop week_iri units ind_sale_IRI groc_dummy
		
		* Adapt naming [*]
		rename (price_raw price_ref) (P PRF)
		
		* Month indicator
		gen int t = month - 491
		sum t, meanonly
		global last_t = `r(max)'

		* Reset Price Calculation [*]
		global prices_short "P PRF"
		qui foreach price of global prices_short {
			noisily di "`price'"
			
			** Prep
			*** Identify price changes (relative to previous observation)
			replace `price' = log(`price')
			bysort 	id_nr (month): gen byte D_`price' = (`price' != `price'[_n-1]) if _n != 1 & valid1 == 1

			*** Reset price equals current price if the price changed
			gen `price'_reset = `price' if D_`price' == 1
			
			*** Weight used to calculate reset price inflation (nonmissing for price changers only)
			bysort id_nr year (month): egen ATR = total(dollar)
			bysort month (id_nr): egen tot_ATR_r = total(ATR) if missing(`price'_reset) == 0
			gen `price'_weight_reset = ATR/tot_ATR_r if missing(`price'_reset) == 0
			drop ATR tot_ATR_r

			*** Set reset price to actual price for those with gap >1 month
			bysort id_nr (month): replace `price'_reset = `price' if valid == 0 & valid[_n+1] == 1			// Note1
			
			*** Store reset price inflation in a matrix
			matrix define DL_`price'_reset = (0)

			** Reset price & inflation calculation
			local last_t = ${last_t}
			qui foreach month of numlist 2/`last_t' {
				* Show progress
				noisily di %4s = "`month' " _continue
				if `month'/30 == trunc(`month'/30) noisily di " |"
					
				* Calculate product level reset price inflation
				bysort id_nr (month): gen DL_`price'_reset_idnr = `price'_reset - `price'_reset[_n-1]	if t == `month'
				
				* Calculate weighted average category level reset price inflation
				sum DL_`price'_reset_idnr [aweight=`price'_weight_reset], meanonly
				local DL_`price'_reset_`month' = r(sum)
				matrix DL_`price'_reset = (nullmat(DL_`price'_reset) \ r(sum))

				* Update reset prices of idnrs that didn't change their price
				replace `price'_reset = `price'_reset[_n-1] + `DL_`price'_reset_`month''	if D_`price' != 1	& valid == 1 & t == `month'
				
				* Clean up and restart
				drop DL_`price'_reset_idnr
			}
			noisily di _newline
		}
		
		* Clean up [*]
		keep id_nr month P PRF P_reset PRF_reset
		
		* Obtain reset price inflations [*]
		svmat DL_P_reset
		svmat DL_PRF_reset
		gen DL_P_month = _n + 491 if _n <= 144
		format DL_P_month %tm
		
		* Save
		save "$path_output/resetPrices${sample}/resetPrices_`category'", replace		// Note: based on v1_r data
	}
}


if "$storeIdCrosswalk" == "1" {
    * Start log
    cap log close storeIdCrosswalk
    log using "$path_logs/storeIdCrosswalk.log", append text name(storeIdCrosswalk)
	cap mkdir "$path_intermediate${sample}"
	cap mkdir "$path_intermediate${sample}/crosswalk_store_idnr"
	cap mkdir "$path_intermediate${sample}/crosswalk_store_idnr/byCategory"
	
	* Loop over categories
	qui foreach category of local categories {	
		* Load data
		noisily di "`category'"
		use "$path_output/completePriceData${sample}/`category'", replace	
	
		* Reduce to id>store combination
		keep id_nr upc_code iri_key
		gduplicates drop
		
		* Save
		gen category = "`category'"
		encodeCategories
		defineCategoryLabel
		label values category categoryLabel
		save "$path_intermediate${sample}/crosswalk_store_idnr/byCategory/`category'", replace
	}
	
	* Combine categories
	local filelist: dir "$path_intermediate${sample}/crosswalk_store_idnr/byCategory" files "*.dta"
	local filecount : word count `filelist'
	if `filecount' == 31 {
	    clear
		foreach filename of local filelist {
		    append using "$path_intermediate${sample}/crosswalk_store_idnr/byCategory/`filename'"
		}
		save "$path_intermediate${sample}/crosswalk_store_idnr/crosswalk_store_idnr", replace
	}
	
		
    
    * Close log
    log close storeIdCrosswalk
}


log close log
